MatMulFusion

矩阵乘法融合(可选偏置和激活),计算:

\[C = \operatorname{act}(A \times B + \text{bias})\]

其中 A 的形状为 \(M\times K\),B 的形状为 \(K\times N\),C 与可选的 bias 的形状为 \(M\times N\)(当 bias_broadcast=1 时,bias 为长度 N 的向量)。激活 act 支持:

  • 0: 无激活(Identity)

  • 1: ReLU

  • 2: ReLU6

输入:
  • A - 输入矩阵 A(行优先,连续存储)。大小 M×K。

  • B - 输入矩阵 B(行优先,连续存储)。大小 K×N。

  • bias - 偏置(可为 NULL,此时不加偏置)。当 bias_broadcast(即 params[6])为 1 时,bias 是长度为 N 的向量;否则是大小为 M×N 的矩阵。

  • params - 参数打包成数组(共7个元素):
    • params[0] (M) - 维度参数:输出矩阵 C 的行数。

    • params[1] (N) - 维度参数:输出矩阵 C 的列数。

    • params[2] (K) - 维度参数:A 与 B 相乘时的公共(归约)维度。

    • params[3] (activation_type) - 激活类型,取值 {0,1,2}(0: 无激活,1: ReLU,2: ReLU6)。

    • params[4] (A_transpose) - A矩阵是否转置,取值 {0,1}。

    • params[5] (B_transpose) - B矩阵是否转置,取值 {0,1}。

    • params[6] (bias_broadcast) - 偏置是否广播,取值 {0,1}。

  • core_mask - 核掩码,仅共享存储版本(_s 后缀)的接口带有该参数,私有存储版本(_p 后缀)没有;例如 0b1111 表示使用 4 个核。

输出:
  • C - 输出矩阵(行优先,大小 M×N)。

支持平台:

FT78NE、MT7004

备注:

  • FT78NE 支持 int8、int16、int32、fp32、fp64、cplx64、cplx128

  • MT7004 支持 fp16、fp32、int16、int32、cplx64

  • 复数类型的激活逐分量应用于实部与虚部

  • 请确保输入按行优先连续存储,并保证计算结果不超出对应数据类型的表示范围;int8/int16/int32 的计算不做饱和裁剪

  • 转置操作通过参数控制,无需预先转置矩阵

共享存储版本:

void i8_matmul_fusion_s(int8_t *A, int8_t *B, int8_t *C, int8_t *bias, long long *params, int core_mask)
void i16_matmul_fusion_s(int16_t *A, int16_t *B, int16_t *C, int16_t *bias, long long *params, int core_mask)
void i32_matmul_fusion_s(int *A, int *B, int *C, int *bias, long long *params, int core_mask)
void hp_matmul_fusion_s(half *A, half *B, half *C, half *bias, long long *params, int core_mask)
void fp_matmul_fusion_s(float *A, float *B, float *C, float *bias, long long *params, int core_mask)
void dp_matmul_fusion_s(double *A, double *B, double *C, double *bias, long long *params, int core_mask)
void c64_matmul_fusion_s(float *A, float *B, float *C, float *bias, long long *params, int core_mask)
void c128_matmul_fusion_s(double *A, double *B, double *C, double *bias, long long *params, int core_mask)

C调用示例:

 1#include <stdio.h>
 2#include <stdbool.h>
 3
 4int main(int argc, char* argv[]) {
 5    float* A_ref = (float*)0x90000000;
 6    float* B_ref = (float*)0x91000000;
 7    float* C_ref = (float*)0x92000000;
 8    float* bias_ref = (float*)0x93000000;
 9
10    float* C_multi = (float*)0x95000000;
11    int core_mask = 0b1111; // 使用4核
12
13    int M = 64;
14    int N = 64;
15    int K = 64;
16
17    bool bias_broadcast = true;
18    bool A_transpose = false;
19    bool B_transpose = true;
20
21    // Initialize test data (core 0 only)
22    if (coreid == 0) {
23
24        // Initialize A, B, bias with small values
25        for (int i = 0; i < M * K; ++i) {
26            A_ref[i] = (float)(i % 10) * 0.1f;
27        }
28        for (int i = 0; i < K * N; ++i) {
29            B_ref[i] = (float)(i % 10) * 0.1f;
30        }
31        for (int i = 0; i < M * N; ++i) {
32            C_ref[i] = 0.0f;
33            C_multi[i] = 0.0f;
34            bias_ref[i] = (float)(i % 5) * 0.01f;
35        }
36    }
37    long long params[7];
38    params[0] = (long long)M;
39    params[1] = (long long)N;
40    params[2] = (long long)K;
41    params[3] = (long long)ACTIVATION_RELU;
42    params[4] = (long long)A_transpose;
43    params[5] = (long long)B_transpose;
44    params[6] = (long long)bias_broadcast;
45
46    fp_matmul_fusion_s(A_ref, B_ref, C_multi, bias_ref, params, core_mask);
47    return 0;
48}

私有存储版本:

void i8_matmul_fusion_p(int8_t *A, int8_t *B, int8_t *C, int8_t *bias, long long *params)
void i16_matmul_fusion_p(int16_t *A, int16_t *B, int16_t *C, int16_t *bias, long long *params)
void i32_matmul_fusion_p(int *A, int *B, int *C, int *bias, long long *params)
void hp_matmul_fusion_p(half *A, half *B, half *C, half *bias, long long *params)
void fp_matmul_fusion_p(float *A, float *B, float *C, float *bias, long long *params)
void dp_matmul_fusion_p(double *A, double *B, double *C, double *bias, long long *params)
void c64_matmul_fusion_p(float *A, float *B, float *C, float *bias, long long *params)
void c128_matmul_fusion_p(double *A, double *B, double *C, double *bias, long long *params)

C调用示例:

 1#include <stdio.h>
 2#include <stdbool.h>
 3
 4int main(int argc, char* argv[]) {
 5    float* A_ref = (float*)0x10010000;
 6    float* B_ref = (float*)0x10020000;
 7    float* C_ref = (float*)0x10030000;
 8    float* bias_ref = (float*)0x10040000;
 9
10    float* C_single = (float*)0x10050000;
11
12    int M = 8;
13    int N = 8;
14    int K = 8;
15
16    bool bias_broadcast = true;
17    bool A_transpose = true;
18    bool B_transpose = false;
19
20    // Initialize A, B, bias with small values
21    for (int i = 0; i < M * K; ++i) {
22        A_ref[i] = (float)(i % 10) * 0.1f;
23    }
24    for (int i = 0; i < K * N; ++i) {
25        B_ref[i] = (float)(i % 10) * 0.1f;
26    }
27    for (int i = 0; i < M * N; ++i) {
28        C_ref[i] = 0.0f;
29        C_single[i] = 0.0f;
30        bias_ref[i] = (float)(i % 5) * 0.01f;
31    }
32
33    long long params[7];
34
35    params[0] = (long long)M;
36    params[1] = (long long)N;
37    params[2] = (long long)K;
38    params[3] = (long long)ACTIVATION_RELU;
39    params[4] = (long long)A_transpose;
40    params[5] = (long long)B_transpose;
41    params[6] = (long long)bias_broadcast;
42
43    fp_matmul_fusion_p(A_ref, B_ref, C_single, bias_ref, params);
44    return 0;
45}